<<<<<<< HEAD ======= >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337 00_all <<<<<<< HEAD ======= >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337 <<<<<<< HEAD ======= >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337 ======= >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337 <<<<<<< HEAD ======= >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

00_all

01_load

<<<<<<< HEAD
library("tidyverse")
Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
Warning: package 'purrr' was built under R version 4.5.2
Warning: package 'dplyr' was built under R version 4.5.2
Warning: package 'stringr' was built under R version 4.5.2
Warning: package 'forcats' was built under R version 4.5.2
Warning: package 'lubridate' was built under R version 4.5.2
raw_data <- read_csv("../data/_raw/SAA_Biospecimen_Analysis_Results_01Nov2025.csv")
write_tsv(raw_data, "../data/01_dat_load.tsv")

02_clean

library(tidyverse)

Loading the raw data

raw_data <- read_tsv("../data/01_dat_load.tsv")
raw_data |> 
  slice_sample(n = 10)
# A tibble: 10 × 48
    PATNO SEX    COHORT CLINICAL_EVENT TYPE        SAAMethod SAA_Status SAA_Type
    <dbl> <chr>  <chr>  <chr>          <chr>       <chr>     <chr>      <chr>   
 1   3166 Male   PD     V04            Cerebrospi… Amprion-… Positive   Type1   
 2   3630 Female PD     V06            Cerebrospi… Amprion-… Inconclus… Undeter…
 3  42378 Male   PD     BL             Cerebrospi… Amprion-… Positive   <NA>    
 4   3828 Male   PD     V04            Cerebrospi… Amprion-… Positive   Type1   
 5   4121 Female PD     V08            Cerebrospi… Amprion-… Positive   Undeter…
 6  54161 Male   PD     BL             Cerebrospi… Amprion-… Positive   <NA>    
 7   3510 Male   PD     BL             Cerebrospi… Amprion-… Positive   <NA>    
 8   3609 Male   PD     BL             Cerebrospi… Amprion-… Positive   <NA>    
 9  50509 Female PD     V04            Cerebrospi… Amprion-… Positive   Type1   
10 179752 Female PD     BL             Cerebrospi… Amprion-… Negative   <NA>    
# ℹ 40 more variables: Fmax_24h_Rep1 <dbl>, Fmax_24h_Rep2 <dbl>,
#   Fmax_24h_Rep3 <dbl>, TTT_24h_Rep1 <dbl>, TTT_24h_Rep2 <dbl>,
#   TTT_24h_Rep3 <dbl>, AUC_24h_Rep1 <dbl>, AUC_24h_Rep2 <dbl>,
#   AUC_24h_Rep3 <dbl>, TSmax_24h_Rep1 <dbl>, TSmax_24h_Rep2 <dbl>,
#   TSmax_24h_Rep3 <dbl>, SLOPEMax_24h_Rep1 <dbl>, SLOPEMax_24h_Rep2 <dbl>,
#   SLOPEMax_24h_Rep3 <dbl>, Fmax_150h_Rep1 <dbl>, Fmax_150h_Rep2 <dbl>,
#   Fmax_150h_Rep3 <dbl>, TTT_150h_Rep1 <dbl>, TTT_150h_Rep2 <dbl>, …

Clean the data

Deleting columns

=======
library("tidyverse")

Making sure the directories exist

raw_dir <- "../data/_raw/"

if( !dir.exists(raw_dir) ){
  dir.create(path = raw_dir)
}

Please see the README to for the data retrieval and correct placement.

raw_data <- read_csv("../data/_raw/SAA_Biospecimen_Analysis_Results_01Nov2025.csv")

Saving data-frame

write_tsv(raw_data, "../data/01_dat_load.tsv")

02_clean

library(tidyverse)

Loading the raw data

raw_data <- read_tsv("../data/01_dat_load.tsv")

Clean the data

Selecting only PD patients:

SWEDD is an outdated ID, mentioned in the PPMI data guide, and we will not be using the data from these patients.

tidy_data <- raw_data |>
  filter(COHORT != "SWEDD")

Deleting columns

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

Some columns in the data-set are not part of the chosen analytic points. The columns chosen to omit is at the cause are either:

  • Irrelevant to the analysis such as the name of the institution or the responsible practitioner.

  • Contain the same value for all observation such as the TYPE (which is always a cerebrospinal fluid sample) and SAAMethod (results always made on the basis of a SAA).

  • Only have a value in observed in one of the projects: SAA_Type, TSmax, T50.

  • Lack meassurements for most observations like SampleVolume.

<<<<<<< HEAD
# Neglecting the unused columns:
tidy_data <- raw_data |>
  select(-"COHORT",
         -"PI_NAME",
         -"PI_INSTITUTION",
         -"TYPE",
         -"SAAMethod",
         -"SAA_Type",
         -starts_with("TSmax"),
         -starts_with("SLOPE"),
         -starts_with("T50"),
         -starts_with("Sample"))

Missing value interpretation

One person (patient_number = 41184) has the value “U01” in the clinical event column. As no information is found explaining the value, the patient is removed from the data.

tidy_data <- tidy_data |>
  filter(CLINICAL_EVENT != "U01")

Elongating the data

Originally all repetitions of Fmax, TTT and AUC had its own column. To make the data more comprehensible all repetitions are stored in one column for each measurement. The data will then contain only one column of each measurement (Fmax/AUC/TTT), a column with the duration (either 24 or 150 hours) and a column defining the repetition number (1-3) of the measurement.

tidy_data <- tidy_data |> 
  pivot_longer(
    cols = matches("Fmax|TTT|AUC"), 
    names_to = c("measurement", "duration", "rep"), 
    names_pattern = ("(Fmax|TTT|AUC)_(\\d+)h_Rep(\\d+)"),
    values_to = "value",
    values_drop_na = TRUE
  ) |> 
  pivot_wider(
    names_from = measurement,
    values_from = value
  )
tidy_data |> 
  slice_sample(n = 10)
# A tibble: 10 × 14
    PATNO SEX    CLINICAL_EVENT SAA_Status InstrumentRep1 InstrumentRep2
    <dbl> <chr>  <chr>          <chr>               <dbl>          <dbl>
 1 293633 Female BL             Positive                5              5
 2 135861 Male   BL             Positive                2              2
 3  40731 Male   BL             Positive                4              4
 4  41295 Female BL             Positive                6              6
 5  59507 Male   BL             Positive                4              4
 6   3184 Female BL             Positive                4              4
 7   3771 Female V04            Positive                6              6
 8   3608 Male   V04            Positive                5              5
 9   4012 Female BL             Positive                4              4
10 215275 Male   BL             Positive                9              9
# ℹ 8 more variables: InstrumentRep3 <dbl>, RUNDATE <date>, PROJECTID <dbl>,
#   duration <chr>, rep <chr>, Fmax <dbl>, TTT <dbl>, AUC <dbl>

Reducing the “Instrument” columns

When looking at the data it seemed that the instrument used for one observation for all three repetitions was the same. We found that it was the case for all observations when checking:

all(tidy_data$InstrumentRep1 == tidy_data$InstrumentRep2 & tidy_data$InstrumentRep2 == tidy_data$InstrumentRep3)
=======
# Neglecting the unused columns:
tidy_data <- tidy_data |>
  select(-"COHORT",
         -"PI_NAME",
         -"PI_INSTITUTION",
         -"TYPE",
         -"SAAMethod",
         -"SAA_Type",
         -starts_with("TSmax"),
         -starts_with("SLOPE"),
         -starts_with("T50"),
         -starts_with("Sample"))

Missing value interpretation

One person (patient_number = 41184) has the value “U01” in the clinical event column. As no information is found explaining the value, the patient is removed from the data.

tidy_data <- tidy_data |>
  filter(CLINICAL_EVENT != "U01")

Elongating the data

Originally all repetitions of Fmax, TTT and AUC had its own column. To make the data more comprehensible all repetitions are stored in one column for each measurement. The data will then contain only one column of each measurement (Fmax/AUC/TTT), a column with the duration (either 24 or 150 hours) and a column defining the repetition number (1-3) of the measurement.

tidy_data <- tidy_data |> 
  pivot_longer(
    cols = matches("Fmax|TTT|AUC"), 
    names_to = c("measurement", "duration", "rep"), 
    names_pattern = ("(Fmax|TTT|AUC)_(\\d+)h_Rep(\\d+)"),
    values_to = "value",
    values_drop_na = TRUE
  ) |> 
  
  pivot_wider(
    names_from = measurement,
    values_from = value
  )

Reducing the “Instrument” columns

When looking at the data it seemed that the instrument used for one observation for all three repetitions was the same. We found that it was the case for all observations when checking:

all(
  tidy_data$InstrumentRep1 == tidy_data$InstrumentRep2 & tidy_data$InstrumentRep2 == tidy_data$InstrumentRep3
  )
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
[1] TRUE

To reduce the number of columns, the instrument is stored in a single column.

<<<<<<< HEAD
tidy_data <- tidy_data |> 
  select(!InstrumentRep2 & !InstrumentRep3) |> 
  rename(instrument = InstrumentRep1)

Renaming

To get a more clear overview of the data-set some of the names were edited and the naming convention was changed.

tidy_data <- tidy_data |>
  rename(
    patient_number = PATNO,
    sex = SEX,
    project = PROJECTID,
    clinical_event = CLINICAL_EVENT,
    saa_result = SAA_Status,
    rundate = RUNDATE,
    fmax = Fmax,
    ttt = TTT,
    auc = AUC)
tidy_data |> 
  slice_sample(n = 10)
=======
tidy_data <- tidy_data |> 
  select(!InstrumentRep2 & !InstrumentRep3) |> 
  rename(instrument = InstrumentRep1)

Renaming the columns to preferred naming convention

tidy_data <- tidy_data |>
  rename(
    patient_number = PATNO,
    sex = SEX,
    project = PROJECTID,
    clinical_event = CLINICAL_EVENT,
    saa_result = SAA_Status,
    rundate = RUNDATE,
    fmax = Fmax,
    ttt = TTT,
    auc = AUC)
tidy_data |> 
  slice_sample(n = 10)
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
# A tibble: 10 × 12
   patient_number sex    clinical_event saa_result instrument rundate    project
            <dbl> <chr>  <chr>          <chr>           <dbl> <date>       <dbl>
<<<<<<< HEAD
 1         103566 Male   BL             Positive            2 2023-08-17     237
 2           3823 Male   V04            Positive            5 2024-10-28     237
 3         148699 Female BL             Positive            4 2023-08-17     237
 4          41486 Male   V04            Positive            4 2023-12-18     237
 5           3617 Male   BL             Positive            2 2022-03-24     155
 6           3230 Male   V04            Inconclus…          5 2024-12-10     237
 7           3419 Female BL             Positive            2 2022-02-24     155
 8           3831 Male   BL             Positive            2 2022-03-10     155
 9         103813 Female BL             Positive            2 2023-01-05     237
10          70354 Male   BL             Positive            4 2022-03-03     155
=======
 1           4035 Male   BL             Positive            2 2022-05-19     155
 2         236926 Male   BL             Positive            4 2024-04-29     237
 3         121109 Female BL             Positive            2 2023-08-16     237
 4           3951 Male   BL             Positive            2 2022-04-21     155
 5         292825 Male   BL             Positive            4 2024-04-29     237
 6          41282 Female BL             Inconclus…          4 2022-05-26     155
 7         325566 Male   BL             Positive            4 2024-05-29     237
 8          41305 Female BL             Positive            6 2022-03-03     155
 9          40806 Male   V08            Positive            6 2024-12-16     237
10         313162 Male   BL             Positive           10 2024-05-03     237
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
# ℹ 5 more variables: duration <chr>, rep <chr>, fmax <dbl>, ttt <dbl>,
#   auc <dbl>
<<<<<<< HEAD

Changing the column-types

For further analysis the column-type was changed to ensure correct plotting.

tidy_data <- tidy_data |>
  mutate(across(c(project, 
                  duration, 
                  rep, 
                  instrument), 
                
                as.character))
tidy_data |> 
  slice_sample(n = 10)
# A tibble: 10 × 12
   patient_number sex    clinical_event saa_result instrument rundate    project
            <dbl> <chr>  <chr>          <chr>      <chr>      <date>     <chr>  
 1           3178 Male   BL             Positive   2          2023-01-04 237    
 2         293445 Female BL             Negative   9          2024-05-03 237    
 3           4122 Male   BL             Inconclus… 6          2025-05-28 237    
 4           3333 Male   BL             Positive   6          2024-12-10 237    
 5           3435 Male   V04            Positive   5          2024-12-10 237    
 6         173266 Male   BL             Positive   2          2023-08-17 237    
 7           3325 Female BL             Positive   6          2022-02-17 155    
 8           4115 Male   V04            Positive   5          2024-12-10 237    
 9           3830 Female BL             Positive   2          2022-05-05 155    
10         156676 Male   BL             Positive   9          2024-04-29 237    
# ℹ 5 more variables: duration <chr>, rep <chr>, fmax <dbl>, ttt <dbl>,
#   auc <dbl>

Relocating columns

The columns containing important information about an observation was moved to the front.

tidy_data <- tidy_data |>
    relocate(project)

tidy_data <- tidy_data |>
    relocate(duration:auc, .after = saa_result)

tidy_data |> 
  slice_sample(n = 10)
# A tibble: 10 × 12
   project patient_number sex    clinical_event saa_result duration rep     fmax
   <chr>            <dbl> <chr>  <chr>          <chr>      <chr>    <chr>  <dbl>
 1 237               4122 Male   BL             Negative   24       3        529
 2 155              74817 Male   BL             Positive   150      1     124938
 3 237               3528 Female V06            Positive   24       2     196534
 4 155               3445 Male   BL             Positive   150      2     109094
 5 155              42171 Male   BL             Positive   150      3     155818
 6 237             206833 Male   BL             Negative   24       2        518
 7 237             243027 Male   V04            Negative   24       1        498
 8 237             212275 Female BL             Negative   24       1        668
 9 155              40691 Female BL             Positive   150      3      84221
10 155              52678 Male   BL             Positive   150      1     139329
# ℹ 4 more variables: ttt <dbl>, auc <dbl>, instrument <chr>, rundate <date>

Cluster the patients’ observations

Some patients have multiple observations and uses the patient_number as an identifier. One patients observations are all gathered together by arranging the patient_number and individually put in order depending on their visit.

tidy_data <- tidy_data |>
  arrange(patient_number, rundate)

Save the cleaned data

write_tsv(tidy_data, "../data/02_dat_clean.tsv")

03_augment

library(tidyverse)
library(here)
Warning: package 'here' was built under R version 4.5.2
library(fs)

Loading the clean data

tidy_data <- read_tsv("../data/02_dat_clean.tsv")

Augment Data

Finding days from baseline

Some patients were tested multiple times at different dates. We calculate the days from baseline and add a colum with this information.

It should be noted that for 2 patients (40771 and 40754) the rundate appears to have been inputted wrong, so that it appears as if several samples were taken on just one day, despite them having different clinical_events. Therefore, it is insufficient to just sort by days_from_baseline, and clinical_events is therefore kept.

tidy_data <- tidy_data |> 
  mutate(rundate = as.Date(rundate)) |> 
  group_by(patient_number) |> 
  mutate(days_from_baseline = as.numeric(rundate - min(rundate, na.rm = TRUE))) |> 
  ungroup()

Calculating means of Fmax, ttt and auc

Instead of working with three replicates of a single observation, moving forward the mean of the variable will be used to represent the observation.

The means are calculated of Fmax, ttt and auc from a single patient (patient_number) per visit (clinical_event).

means <- tidy_data |>
  group_by(patient_number, clinical_event, days_from_baseline) |>
  summarise(
    fmax_mean = mean(fmax, na.rm = TRUE),
    ttt_mean  = mean(ttt,  na.rm = TRUE),
    auc_mean  = mean(auc,  na.rm = TRUE),
    .groups = "drop"
  )

# Adding the means to the dataset
aug_data <- tidy_data |>
  left_join(means,
            by = c("patient_number", 
                   "clinical_event", 
                   "days_from_baseline"))

Performing a quality check

When PPMI performed their study, they choose a criteria the Fmax value must meet, for the SAA-result to indicate PD. The indication of PD is stored in the saa_result column as either positive, negative or inconclusive. To ensure they correctly assigned the saa_results to the observations, a quality check is made.

As the two projects differ in method, they used different requirements for each. The requirements used can be seen in the table below:

=======

Changing the column-types

source("09_proj_func.R")
tidy_data <- char_type_col(tidy_data)

Relocating columns

The columns containing important information about an observation was moved to the front.

tidy_data <- tidy_data |>
    relocate(project)

tidy_data <- tidy_data |>
    relocate(duration:auc, .after = saa_result)

Cluster the patients’ observations

Some patients have multiple observations and uses the patient_number as an identifier. One patients observations are all gathered together by arranging the patient_number and individually put in order depending on their visit.

tidy_data <- tidy_data |>
  arrange(patient_number, rundate)

Save the cleaned data

write_tsv(tidy_data, "../data/02_dat_clean.tsv")

03_augment

library(tidyverse)

Loading the clean data

tidy_data <- read_tsv("../data/02_dat_clean.tsv")

Augment Data

Finding days from baseline

Some patients were tested multiple times at different dates. We calculate the days from baseline and add a colum with this information.

It should be noted that for 2 patients (40771 and 40754) the rundate appears to have been inputted wrong, so that it appears as if several samples were taken on just one day, despite them having different clinical_events. Therefore, it is insufficient to just sort by days_from_baseline, and clinical_events is therefore kept.

tidy_data <- tidy_data |> 
  mutate(rundate = as.Date(rundate)) |> 
  group_by(patient_number) |> 
  mutate(days_from_baseline = as.numeric(rundate - min(rundate, 
                                                       na.rm = TRUE)
                                         )) |> 
  ungroup()

Calculating means of Fmax, ttt and auc

Instead of working with three replicates of a single observation, moving forward the mean of the variable will be used to represent the observation.

The means are calculated of Fmax, ttt and auc from a single patient (patient_number) per visit (clinical_event).

means <- tidy_data |>
  group_by(patient_number, clinical_event, days_from_baseline) |>
  
  summarise(
    fmax_mean = mean(fmax, na.rm = TRUE),
    ttt_mean  = mean(ttt, na.rm = TRUE),
    auc_mean  = mean(auc, na.rm = TRUE),
    .groups = "drop"
  )

aug_data <- tidy_data |>
  left_join(means,
            by = c("patient_number", 
                   "clinical_event", 
                   "days_from_baseline"))

Initiating a quality check

When PPMI performed their study, they choose a criteria the Fmax value must meet, for the SAA-result to indicate PD. To ensure they correctly assigned positive/negative/inconclusive, the saa_results to the observations, a quality check is made.

As the two projects differ in method, they used different requirements for each.

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
Result Project 155 - Criteria of Fmax
Positive All 3 replicates have Fmax ≥ 5,000 RFU
Negative 0 or 1 replicate has Fmax ≥ 5,000 RFU
Inconclusive Exactly 2 replicates have Fmax ≥ 5,000 RFU
Result Project 237 - Criteria of Fmax
Positive
  • All 3 replicates have Fmax ≥ 45,000 RFU

    or

  • Exactly 2 or 3 replicates have Fmax ≥ 3,000 RFU & < 45,000 RFU

    or

  • Exactly 2 replicates have Fmax ≥ 45,000 RFU & 1 replicate has Fmax ≥ 3,000 RFU & < 45,000 RFU

Negative
  • 0 or 1 replicate has Fmax ≥ 3,000 RFU
Inconclusive
  • Exactly 2 replicates have Fmax ≥ 45,000 RFU & 1 replicate has Fmax < 3,000 RFU

    or

  • Exactly 1 replicate has Fmax ≥ 45,000 RFU & 1 replicate has Fmax ≥ 3,000 RFU & < 45,000 RFU & 1 replicate has Fmax < 3,000 RFU

<<<<<<< HEAD

Prepare replicate vectors for SAA rule

reps <- tidy_data |>
  group_by(patient_number, project, clinical_event, days_from_baseline) |> 
  summarise(
    fmax_reps = list(fmax),
    .groups = "drop"
  )

aug_data <- aug_data |> 
  left_join(reps,
            by = c("patient_number", 
                   "project", 
                   "clinical_event",
                   "days_from_baseline"))

We prepare to calculate the status from the fmax values by first creating two functions with the rules:

saa_rule_155 <- function(x) {
  x <- unlist(x)
  n_pos <- sum(x >= 5000, na.rm = TRUE)

  case_when(
    n_pos == 3 ~ "Positive",
    n_pos <= 1 ~ "Negative",
    n_pos == 2 ~ "Inconclusive",
    TRUE ~ NA_character_
  )
}

saa_rule_237 <- function(x) {
  x <- unlist(x)

  n_ge_45000 <- sum(x >= 45000, na.rm = TRUE)
  n_ge_3000  <- sum(x >= 3000,  na.rm = TRUE)
  n_between  <- sum(x >= 3000 & x < 45000, na.rm = TRUE)
  n_lt_3000  <- sum(x < 3000, na.rm = TRUE)

  # Positive
  if (n_ge_45000 == 3) return("Positive")
  if (n_between == 2 | n_between == 3) return("Positive")
  if (n_ge_45000 == 2 & n_between == 1) return("Positive")

  # Negative
  if (n_ge_3000 <= 1) return("Negative")

  # Inconclusive
  if (length(x<3)) return("Inconclusive")
  if (n_ge_45000 == 2 & n_lt_3000 == 1) return("Inconclusive")
  if (n_ge_45000 == 1 & n_between == 1 & n_lt_3000 == 1) return("Inconclusive")

  NA_character_
}


Applying the functions and subsequently removing the generated fmax_reps column.

aug_data <- aug_data |> 
  rowwise() |> 
  mutate(
    saa_custom = case_when(
      project == "155" ~ saa_rule_155(fmax_reps),
      project == "237" ~ saa_rule_237(fmax_reps),
      TRUE ~ NA_character_
    )
  ) |> 
  ungroup()

aug_data <- aug_data |>  
  select(-fmax_reps)

Declaring datatype in some columns

aug_data <- aug_data |> 
  mutate(across(c(project, 
                  duration, 
                  rep, 
                  instrument), 
                
                as.character))

Save the augmented data

write_tsv(aug_data, "../data/03_aug_data.tsv")

Moving html files in the right place

old_dir <- here("R", "02_augment.html")
new_dir <- here("results", "02_augment.html")
if (file_exists(old_dir)) {
file_move(old_dir, new_dir)
}
=======

To prepare replicate vectors for SAA rule all three fmax from each observation is gathered in a list.

source("09_proj_func.R")

reps <- tidy_data |>
  group_by(patient_number, 
           project, 
           clinical_event, 
           days_from_baseline) |> 
  
  summarise(
    fmax_reps = list(fmax),
    .groups = "drop"
  )

aug_data <- aug_data |> 
  left_join(reps,
            by = c("patient_number", 
                   "project", 
                   "clinical_event",
                   "days_from_baseline"))

The two rules have been made as functions,

Applying the functions and subsequently removing the generated fmax_reps column:

source("09_proj_func.R")

aug_data <- aug_data |> 
  rowwise() |> 
  
  mutate(
    saa_custom = case_when(
      project == "155" ~ saa_rule_155(fmax_reps),
      project == "237" ~ saa_rule_237(fmax_reps),
      TRUE ~ NA_character_
    )
  ) |> 
  
  ungroup()

aug_data <- aug_data |>  
  select(-fmax_reps)

Save the augmented data

write_tsv(aug_data, "../data/03_aug_data.tsv")
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

04_describe

<<<<<<< HEAD
library(tidyverse)

Loading the augmented data

saa_aug_data <- read_tsv("../data/03_aug_data.tsv")

Describing the data

To get a better understanding of the data being worked with, some simple descriptive plots will be made.

A general theme has been made:

  theme_custom <- theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 13, margin = margin(b = 7)),
    plot.subtitle = element_text(size = 10, margin = margin(b = 12)),
    plot.caption = element_text(size = 8, margin = margin(t = 20)),
      
    axis.title.x = element_text(size = 10, margin = margin(t = 10)),
    axis.title.y = element_text(size = 10, margin = margin(r = 10)),

    axis.text.x = element_text(size = 10, margin = margin(t = 5)),
    axis.text.y = element_text(size = 10, margin = margin(r = 5)),
    
    legend.position = "none",
    )
knitr::opts_chunk$set(
  fig.path = "../results/",
  fig.format = "png",
  echo = TRUE
)

Distribution across projects

The data contains observations made in two projects. The distribution from each projects can be seen in the plot as well as the male/female ratio.

project_distribution <- saa_aug_data |>
  mutate(project = as.character(project)) |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = sex)) + 
  scale_fill_manual(values = c("#da9283","#3C3C68"),
                    name = "Sex") +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Project Distribution",
       subtitle = "The distribution of observations and gender across project 155 and 237",
       x = "Project",
       y = "Observations")
project_distribution

=======
library(tidyverse)

Loading the augmented data

aug_data <- read_tsv("../data/03_aug_data.tsv")

Describing the data

To ensure uniformity across the different plots a general theme has been made:

  theme_custom <- theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 13, margin = margin(b = 7)),
    plot.subtitle = element_text(size = 10, margin = margin(b = 12)),
    plot.caption = element_text(size = 8, margin = margin(t = 20)),
      
    axis.title.x = element_text(size = 10, margin = margin(t = 10)),
    axis.title.y = element_text(size = 10, margin = margin(r = 10)),

    axis.text.x = element_text(size = 10, margin = margin(t = 5)),
    axis.text.y = element_text(size = 10, margin = margin(r = 5)),
    
    legend.position = "none",
    )

Distribution of observations between projects

The data contains observations made in two projects. The distribution from each projects can be seen in the plot as well as the male/female ratio.

project_distribution <- aug_data |>
  mutate(project = as.character(project)) |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = sex)) + 
  scale_fill_manual(values = c("#da9283","#3C3C68"),
                    name = "Sex") +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Project Distribution",
       subtitle = "The distribution of observations and gender across project 155 and 237",
       x = "Project",
       y = "Observations",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/04_project_distribution.png",
       width = 10,  
       height = 5)

project_distribution

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
<<<<<<< HEAD

Individual participants

The data-set orignially contained more than 2000 observations. Some of the patients appear multiple times in the data set. The amount of different patients enrolled in the experiment is determined. Obs: we removed one patient during the cleaning because of a missing value interpretation.

patients_participants <- saa_aug_data |>
  distinct(patient_number) |>
  summarize(Participants = n())
patients_participants
=======

Individual participants

The data-set orignially contained more than 2000 observations. Some of the patients appear multiple times in the data set. The amount of different patients enrolled in the experiment is determined. Obs: we removed one patient during the cleaning because of a missing value interpretation.

patients_participants <- aug_data |>
  distinct(patient_number) |>
  summarize(Participants = n())
patients_participants
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
# A tibble: 1 × 1
  Participants
         <int>
<<<<<<< HEAD
1         1363

Patient repeats

The distribution of each patient’s latest experiment was plotted. Most of the data only contains information from the Baseline experiment. If the patients went for further experiments, most of them got the last assessment ~12 months (V04) after their Baseline.

multiple_experiments <- saa_aug_data |>
  arrange(desc(days_from_baseline)) |>
  distinct(patient_number, .keep_all = TRUE) |>
  ggplot(mapping = aes(x = clinical_event,
                       fill = clinical_event)) + 
  geom_bar() + 
  scale_fill_manual(values = c("#3C3C68","#627634","#545E75","#C03221", "#94BFBE","#7A1F15","#da9283","#474B24","#474B24","#474B24","#474B24")) +
  theme_custom + 
  labs(title = "Latest observation",
       subtitle = "The latest observation of each patient",
       x = "Latest Observation",
       y = "Participants")
multiple_experiments

Result of the SAA

All patients enrolled was diagnosed with PD previously to SAA analysis. The result of the SAA does not follow the PD diagnosis completely. 1363 individual patients are part of augmented data, but only 1160 were determined positive by SAA at their most recent observation.

saa_aug_data |>
  arrange(desc(days_from_baseline)) |>
  distinct(patient_number, .keep_all = TRUE) |>
  filter(saa_result == "Positive") |>
  summarize(Positive_individuals = n())
======= 1 1300

Number of observations

1 observation is rep 1-3

observations <- aug_data |>
  group_by(clinical_event) |>
  distinct(patient_number) |>
  ungroup()|>
  summarize(Observations = n())
observations
# A tibble: 1 × 1
  Observations
         <int>
1         1889

Result of the SAA

All patients enrolled was diagnosed with PD previously to SAA analysis. The result of the SAA does not follow the PD diagnosis completely. The number of saa-positive patients at their latest visit.

aug_data |>
  arrange(desc(days_from_baseline)) |>
  distinct(patient_number, .keep_all = TRUE) |>
  filter(saa_result == "Positive") |>
  summarize(Positive_individuals = n())
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
# A tibble: 1 × 1
  Positive_individuals
                 <int>
<<<<<<< HEAD
1                 1160

Use of instrument

To analyse the samples of cerebrospinal fluid in the SAA, different equipment was used. In project 155 instrument 2, 4 and 6 were equally used. During project 237 these instruments are also used frequently in addition to instrument 5.

instruments_used <- saa_aug_data |>
  mutate(project = as.character(project)) |>
  mutate(instrument = as.character(instrument)) |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = instrument),
           position = "fill") + 
  scale_fill_manual(breaks = c("2","4","5","6","9","10"),
                    values = c("#da9283","#3C3C68","#C03221","#627634","#7A1F15","#94BFBE"),
                    name = "Instrument",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Used Instrument",
       subtitle = "The instrument used to analyze each observation",
       x = "Project",
       y = "Ratio")
instruments_used

======= 1 1149

Saa-result from all observations, patient repeats accepted

result_distribution <- aug_data |>
  mutate(project = as.character(project)) |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = saa_result)) + 
  scale_fill_manual(values = c("#7A1F15","#da9283","#94BFBE"),
                    name = "Result",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Result Distribution",
       subtitle = "The distribution of saa-result",
       x = "Project",
       y = "Observations",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/04_result_distribution.png",
       width = 10,  
       height = 5)

result_distribution

Patient repeats

The distribution of each patient’s latest experiment was plotted. Most of the data only contains information from the Baseline experiment. If the patients went for further experiments, most of them got the last assessment ~12 months (V04) after their Baseline.

multiple_experiments <- aug_data |>
  arrange(desc(days_from_baseline)) |>
  distinct(patient_number, .keep_all = TRUE) |>
  ggplot(mapping = aes(x = clinical_event,
                       fill = clinical_event)) + 
  geom_bar(show.legend = FALSE) + 
  scale_fill_manual(values = c("#3C3C68","#627634","#545E75","#C03221", "#94BFBE","#7A1F15","#da9283","#474B24","#474B24","#474B24","#474B24")) +
  theme_custom + 
  labs(title = "Latest observation",
       subtitle = "The latest observation of each patient",
       x = "Latest Observation",
       y = "Participants",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/04_multiple_experiments.png",
       width = 10,  
       height = 5)

multiple_experiments

Use of instrument

To analyse the samples of cerebrospinal fluid in the SAA, different equipment was used. In project 155 instrument 2, 4 and 6 were equally used. During project 237 these instruments are also used frequently in addition to instrument 5.

instruments_used <- aug_data |>
  mutate(project = as.character(project)) |>
  mutate(instrument = as.character(instrument)) |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = instrument),
           position = "fill") + 
  scale_fill_manual(breaks = c("2","4","5","6","9","10"),
                    values = c("#da9283","#7A1F15","#C03221","#94BFBE","#3C3C68","#627634"),
                    name = "Instrument",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Used Instrument",
       subtitle = "The instrument used to analyze each observation",
       x = "Project",
       y = "Ratio",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/04_instruments_used.png",
       width = 10,  
       height = 5)

instruments_used

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
<<<<<<< HEAD

High and low ranges of key values

To get a better understanding of the ranges of key values (fmax, auc, ttt) we look at a few examples at each end of the spectrum.

low_range <- function(df, column) {
  df2 <- df |> 
    filter(project == 155) |>
    arrange({{column}}) |>
    pull({{column}})
  mean_low_155 <- mean(df2[1:10], na.rm = TRUE)
  
  df3 <- df |>
    filter(project == 237) |>
    arrange({{column}}) |>
    pull({{column}})
  mean_low_237 <- mean(df3[1:10], na.rm = TRUE)
  return(list(mean_low_155, mean_low_237))
}

low_range(saa_aug_data, fmax)
[[1]]
[1] 231.1

[[2]]
[1] 431.6
low_range(saa_aug_data, auc)
[[1]]
[1] 2401999

[[2]]
[1] 2030936
low_range(saa_aug_data, ttt)
[[1]]
[1] 25.039

[[2]]
[1] 5.411
high_range <- function(df, column) {
  df2 <- df |> 
    filter(project == 155) |>
    arrange(desc({{column}})) |>
    pull({{column}})
  mean_high_155 <- mean(df2[1:10], na.rm = TRUE)
  
  df3 <- df |>
    filter(project == 237) |>
    arrange(desc({{column}})) |>
    pull({{column}})
  mean_high_237 <- mean(df3[1:10], na.rm = TRUE)
  return(list(mean_high_155, mean_high_237))
}

high_range(saa_aug_data, fmax)
[[1]]
[1] 171588

[[2]]
[1] 250669.6
high_range(saa_aug_data, auc)
[[1]]
[1] 40765540

[[2]]
[1] 10506900000
high_range(saa_aug_data, ttt)
[[1]]
[1] 139.697

[[2]]
[1] 23.518
library(patchwork)
Warning: package 'patchwork' was built under R version 4.5.2
((instruments_used + project_distribution) / multiple_experiments) +
  plot_layout(guides = "collect",
              heights = c(5,5,7)) &
  theme(legend.position = "bottom")

=======

High and low ranges of key values

To get a better understanding of the ranges of key values (fmax, auc, ttt) we look at a few examples at each end of the spectrum.

source("09_proj_func.R")

# The function returns 2 values: one from project 155 and one from 237:
low_range(aug_data, fmax, 50)
$Project_155
[1] 263.86

$Project_237
[1] 448.66
low_range(aug_data, auc, 50)
$Project_155
[1] 7723959

$Project_237
[1] 2364708
low_range(aug_data, ttt, 50)
$Project_155
[1] 38.9364

$Project_237
[1] 6.4352
source("09_proj_func.R")

# The function returns 2 values: one from project 155 and one from 237:
high_range(aug_data, fmax, 50)
$Project_155
[1] 159562.7

$Project_237
[1] 237574.2
high_range(aug_data, auc, 50)
$Project_155
[1] 36286298

$Project_237
[1] 9714900000
high_range(aug_data, ttt, 50)
$Project_155
[1] 122.1678

$Project_237
[1] 22.4912
library(patchwork)

((instruments_used + project_distribution) / multiple_experiments / result_distribution) +
  plot_layout(guides = "collect",
              heights = c(5,5,7,5)) &
  theme(legend.position = "bottom")

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
<<<<<<< HEAD

05_analysis_1

Load libraries

library(tidyverse)

Load the augmented data from file:

aug_data <- read_tsv("../data/03_aug_data.tsv")
Rows: 6515 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr   (4): sex, clinical_event, saa_result, saa_custom
dbl  (12): project, patient_number, duration, rep, fmax, ttt, auc, instrumen...
date  (1): rundate

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  theme_custom <- theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 13, margin = margin(b = 7)),
    plot.subtitle = element_text(size = 10, margin = margin(b = 12)),
    plot.caption = element_text(size = 8, margin = margin(t = 20)),
      
    axis.title.x = element_text(size = 10, margin = margin(t = 10)),
    axis.title.y = element_text(size = 10, margin = margin(r = 10)),

    axis.text.x = element_text(size = 10, margin = margin(t = 5)),
    axis.text.y = element_text(size = 10, margin = margin(r = 5)),
    
    legend.position = "none",
    )
=======

05_analysis_1

library(tidyverse)

Loading the augmented data

aug_data <- read_tsv("../data/03_aug_data.tsv")

source("09_proj_func.R")
aug_data <- char_type_col(aug_data)
  theme_custom <- theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 13, margin = margin(b = 7)),
    plot.subtitle = element_text(size = 10, margin = margin(b = 12)),
    plot.caption = element_text(size = 8, margin = margin(t = 20)),
      
    axis.title.x = element_text(size = 10, margin = margin(t = 10)),
    axis.title.y = element_text(size = 10, margin = margin(r = 10)),

    axis.text.x = element_text(size = 10, margin = margin(t = 5)),
    axis.text.y = element_text(size = 10, margin = margin(r = 5)),
    
    legend.position = "none",
    )
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

Performing the quality check of saa_result

During the augmentation a new column was made using the same criteria published by PPMI used to create their saa_result column. We check for any variations between their positive/negative/inconclusive results and the new column.

<<<<<<< HEAD
aug_data |>
  filter(saa_result != saa_custom) |>
  select(project,
         patient_number,
         saa_result,
         saa_custom,
         rep,
         fmax
         ) |>
  pivot_wider(
    names_from = rep,
    values_from = fmax,
    names_prefix = "rep_"
    )
# A tibble: 9 × 7
  project patient_number saa_result saa_custom    rep_1  rep_2 rep_3
    <dbl>          <dbl> <chr>      <chr>         <dbl>  <dbl> <dbl>
1     155           3285 Negative   Inconclusive   7564  96765   277
2     155           3426 Negative   Inconclusive  35881    277  7249
3     155           3473 Negative   Inconclusive 124560    317 10256
4     155           3581 Negative   Inconclusive  14005 163545   326
5     155           3753 Negative   Inconclusive   5361    306 18058
6     155           3972 Negative   Inconclusive    290   8037 37230
7     155           4076 Negative   Inconclusive  75830    309  6933
8     155           4121 Negative   Inconclusive  88234  11994   317
9     155          41749 Negative   Inconclusive   4885   5708 35796

As seen above 9 patients have a negative saa_result, whereas when the column were recreated the results were inconclusive. When looking at the criteria given from PPMI the result should have been inconclusive we found.

=======
mismatches_saa_results <- aug_data |>
  filter(saa_result != saa_custom) |>
  select(project,
         patient_number,
         saa_result,
         saa_custom,
         rep,
         fmax
         ) |>
  
  pivot_wider(
    names_from = rep,
    values_from = fmax,
    names_prefix = "rep_"
    )

mismatches_saa_results
# A tibble: 7 × 7
  project patient_number saa_result saa_custom    rep_1 rep_2 rep_3
  <chr>            <dbl> <chr>      <chr>         <dbl> <dbl> <dbl>
1 155               3285 Negative   Inconclusive   7564 96765   277
2 155               3473 Negative   Inconclusive 124560   317 10256
3 155               3753 Negative   Inconclusive   5361   306 18058
4 155               3972 Negative   Inconclusive    290  8037 37230
5 155               4076 Negative   Inconclusive  75830   309  6933
6 155               4121 Negative   Inconclusive  88234 11994   317
7 155              41749 Negative   Inconclusive   4885  5708 35796

As seen above some patients have a negative saa_result, whereas when the column were recreated the results were inconclusive. When looking at the criteria given from PPMI the result should have been inconclusive we found.

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
Result Project 155 - Criteria of Fmax
Positive All 3 replicates have Fmax ≥ 5,000 RFU
Negative 0 or 1 replicate has Fmax ≥ 5,000 RFU
Inconclusive Exactly 2 replicates have Fmax ≥ 5,000 RFU

In most cases it looks like they decided to assign the negative value if one of the fmax repetition had a much lower value even if 2 replicates have fmax above 5000 RFU.

A visualization can be seen below:

<<<<<<< HEAD
aug_data |>
  mutate(match = saa_result == saa_custom) |>
  ggplot(aes(x = match, fill = match)) +
  geom_bar() +
  scale_fill_manual(values = c("#da9283","#3C3C68")) +
  facet_wrap(~ project) +
  theme_custom + 
  labs(title = "Match versus mismacth",
       subtitle = "Comparison between PPMI's saa_result and custom made results column",
       x = "Match",
       y = "Count")

=======
aug_data |>
  mutate(match = saa_result == saa_custom) |>
  ggplot(aes(x = match, 
             fill = match)
         ) +
  geom_bar() +
  scale_fill_manual(values = c("#da9283","#3C3C68")) +
  facet_wrap(~ project) +
  theme_custom + 
  labs(title = "Match versus mismacth",
       subtitle = "Comparison between PPMI's saa_result and custom made results column",
       x = "Match",
       y = "Count",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/05_quality_check.png",
       width = 10,  
       height = 5)

To visualize the 7 mismatches:

mismatches_saa_results |>
  pivot_longer(
    cols = starts_with("rep"),
    names_to = "rep",
    values_to = "fmax"
  ) |>
  ggplot(aes(x = rep,
             y = fmax,
             group = patient_number)) + 
  
  geom_line(color = "#C03221",
            alpha = 0.5) +
  
  geom_point(size = 3, 
             color = "#C03221") +
  
  geom_hline(yintercept = 5000, 
             linetype = "dashed") +
  
  facet_wrap(~ patient_number, 
             scales = "free_y",
             ncol = 4) +
  
  theme_minimal() +
  theme_custom +
  
  labs(
    title = "Mismatch cases: 155 negative vs our inconclusive",
    subtitle = "7 mismatches (Reported as negative, by definition inconclusive)",
    x = "Replicate",
    y = "Fmax (RFU)",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  )

ggsave(filename = "../results/05_mismatched_saa_plot.png",
       width = 14.5,  
       height = 6.5)  

A log2 plot:

mismatches_saa_results |>
  pivot_longer(
    cols = starts_with("rep"),
    names_to = "rep",
    values_to = "fmax"
  ) |>
  ggplot(aes(x = rep,
             y = log2(fmax),
             group = patient_number)) + 
  
  geom_line(color = "#C03221",
            alpha = 0.5) +
  
  geom_point(size = 3, 
             color = "#C03221") +
  
  geom_hline(yintercept = log2(5000), 
             linetype = "dashed") +
  
  facet_wrap(~ patient_number, 
             scales = "free_y",
             ncol = 4) +
  
  theme_minimal() +
  theme_custom +
  
  labs(
    title = "Mismatch cases: 155 negative vs our inconclusive (Log2)",
    subtitle = "7 mismatches (Reported as negative, by definition inconclusive)",
    x = "Replicate",
    y = "log2 of Fmax (RFU)",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  )

ggsave(filename = "../results/05_mismatched_saa_plot_log.png",
       width = 14.5,  
       height = 6.5)  

There is maybe an extra criteria

mismatches_saa_results |>
  pivot_longer(
    cols = starts_with("rep"),
    names_to = "rep",
    values_to = "fmax"
  ) |>
  summarize(median = median(fmax),
            fmax,
            vari = max(fmax)-min(fmax),
            mean = mean(fmax),
            .by = patient_number) |> 
  
  arrange(fmax)
# A tibble: 21 × 5
   patient_number median  fmax   vari   mean
            <dbl>  <dbl> <dbl>  <dbl>  <dbl>
 1           3285   7564   277  96488 34869.
 2           3972   8037   290  36940 15186.
 3           3753   5361   306  17752  7908.
 4           4076   6933   309  75521 27691.
 5           3473  10256   317 124243 45044.
 6           4121  11994   317  87917 33515 
 7          41749   5708  4885  30911 15463 
 8           3753   5361  5361  17752  7908.
 9          41749   5708  5708  30911 15463 
10           4076   6933  6933  75521 27691.
# ℹ 11 more rows
aug_data |>
  filter(saa_result == "Inconclusive") |>
  ungroup() |>
  summarize(median = median(fmax),
            fmax,
            vari = max(fmax)-min(fmax),
            mean = mean(fmax),
            .by = patient_number) |> 
  arrange(fmax)
# A tibble: 146 × 5
   patient_number median  fmax   vari   mean
            <dbl>  <dbl> <dbl>  <dbl>  <dbl>
 1           3269 64985    246  68955 44811.
 2           3406 71973    262 107042 59846.
 3           3440  5380    264  14388  6765.
 4           3708 21002    265  34597 18710.
 5          41291 58786.   274 167947 71932.
 6           3387  8866    277  14443  7954.
 7         101124 43462    281  49603 31209 
 8          41488 36637    284  90658 39357 
 9          50157 80813    302 125574 68997 
10          41282 34362    306  90380 41785.
# ℹ 136 more rows
aug_data |>
  filter(saa_result == "Negative") |>
  ungroup() |>
  summarize(median = median(fmax),
            fmax,
            vari = max(fmax)-min(fmax),
            mean = mean(fmax),
            .by = patient_number) |> 
  arrange(fmax)
# A tibble: 954 × 5
   patient_number median  fmax   vari   mean
            <dbl>  <dbl> <dbl>  <dbl>  <dbl>
 1         100018    339   209  28111  4982.
 2         100018    339   214  28111  4982.
 3         100018    339   216  28111  4982.
 4         101092    413   216   6968  1513.
 5         101092    413   225   6968  1513.
 6         101092    413   228   6968  1513.
 7          41886    855   257   6204  1842.
 8          41314    395   265 135843 22996 
 9          71978    441   265    395   450.
10           3233    270   266  26106  8969.
# ℹ 944 more rows
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

06_analysis_2

Loading libraries and loading the data

<<<<<<< HEAD
library(tidyverse)

aug_data <- read_tsv("../data/03_aug_data.tsv")

aug_data <- aug_data |> 
  mutate(across(c(project, 
                  duration, 
                  rep, 
                  instrument), 
                
                as.character))
=======
library(tidyverse)

aug_data <- read_tsv("../data/03_aug_data.tsv")

#making sure that the columns is the right type
source("09_proj_func.R")
aug_data <- char_type_col(aug_data)
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

Defining colors and theme

<<<<<<< HEAD
my_cols <- c("#3C3C68","#627634","#C03221","#545E75", "#94BFBE", "#7A1F15",
             "#da9283" , "#474B24")

theme_custom <- theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 13, margin = margin(b = 7)),
    plot.subtitle = element_text(size = 10, margin = margin(b = 12)),
    plot.caption = element_text(size = 8, margin = margin(t = 20)),
      
    axis.title.x = element_text(size = 10, margin = margin(t = 10)),
    axis.title.y = element_text(size = 10, margin = margin(r = 10)),

    axis.text.x = element_text(size = 10, margin = margin(t = 5)),
    axis.text.y = element_text(size = 10, margin = margin(r = 5)),
    
    legend.position = "none",
    )

Analysis of Fmax

=======
my_cols <- c("#3C3C68","#627634","#C03221","#545E75", "#94BFBE", "#7A1F15",
             "#da9283" , "#474B24")

theme_custom <- theme_minimal(base_size = 14) +
  
  theme(
    plot.title = element_text(face = "bold", 
                              size = 13, 
                              margin = margin(b = 7)
                              ),
    
    plot.subtitle = element_text(size = 10, 
                                 margin = margin(b = 12)
                                 ),
    
    plot.caption = element_text(size = 8, 
                                margin = margin(t = 20)
                                ),
      
    axis.title.x = element_text(size = 10, 
                                margin = margin(t = 10)
                                ),
    
    axis.title.y = element_text(size = 10, 
                                margin = margin(r = 10)
                                ),

    axis.text.x = element_text(size = 10, 
                               margin = margin(t = 5)
                               ),
    
    axis.text.y = element_text(size = 10, 
                               margin = margin(r = 5)
                               ),
    
    legend.position = "none",
    )

Analysis of Fmax and SAA results

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

Seed Amplification Assays (SAA) detect α-synuclein aggregates in cerebrospinal fluid (CSF) and monitors fluorescence intensity over time. The point at which fluorescence reaches its maximum (Fmax) reflects the level of α-synuclein aggregation and is therefore a potential diagnostic indicator for Parkinson’s disease (PD).

The two projects perform the assays under different variables:

<<<<<<< HEAD

Variations in assay duration, cycle length, and reaction mixture composition mean that fluorescence values (Fmax) are not directly comparable across projects without normalization. Because assay duration, cycle frequency, and reaction mixture differ between projects, fluorescence values (Fmax) cannot be directly compared without normalizing and looking at variability and protocol-specific characteristics.

The goal of this analysis is therefore to evaluate the precision, consistency, and overall stability of the two assays, including whether instrument differences contribute to variability.

Comparison of Overall Fmax Levels

To illustrate the point stated above we have created the plot below. Here we can see very different absolute Fmax values between the groups, but a very similar distribution and shape of the graph. Indicating a similarity in the behavior of each data set.

p_fmax_mean_dist <- aug_data |> 
  ggplot(
    aes(
      x = fmax_mean,
      y = project,
      color = project,
      fill = project
      )
    ) + 
  
  geom_violin(alpha = 0.5, 
              width = 0.8, 
              trim = FALSE) + 
  
  geom_boxplot(width = 0.12, 
               color = "black", 
               outlier.alpha = 0.3) +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  theme_minimal(base_size = 14) +
  
  labs(
    title = "Distribution of Mean Fmax Across SAA Projects",
    subtitle = "Comparison between 24-hour (Project 237) and 150-hour (Project 155) α-synuclein SAA assays",
    x = "Mean Fmax [RFU]",
    y = "Project",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) + 
  theme_custom +
  scale_x_continuous(labels = scales::label_comma())
ggsave(filename = "../results/06_fmax_mean_dist.png",
       width = 10,  
       height = 5)  

p_fmax_mean_dist

=======

Variations in assay duration, cycle length, and reaction mixture composition mean that fluorescence values (Fmax) are not directly comparable across projects.

Because assay duration, cycle frequency, and reaction mixture differ between projects, fluorescence values (Fmax) cannot be directly compared without looking at protocol-specific characteristics.

We will look at the separation and overlap in fmax, when determining SAA result.

SAA result distribution

We will look at how the ratio of negative, positive and inconclusive is

saa_result_dist <- aug_data |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = saa_result),
           position = "fill") + 
  
  scale_fill_manual(values = c("#da9283","#7A1F15","#94BFBE"),
                    name = "SAA result",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "SAA result distribution",
       subtitle = "SAA result ratio from the two projects",
       x = "Project",
       y = "Ratio",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/06_saa_result_dist.png",
       width = 10,  
       height = 5)   

saa_result_dist

It seems to be very similar - not a very big difference in the ratio. Trying to see the separation:

ANDREASES KOMMER HER

07_analysis_3

Loading libraries and loading the data

library(tidyverse)

aug_data <- read_tsv("../data/03_aug_data.tsv")

#making sure that the columns is the right type

source("09_proj_func.R")
aug_data <- char_type_col(aug_data)

Defining colors and theme

my_cols <- c("#3C3C68","#627634","#C03221","#545E75", "#94BFBE", "#7A1F15",
             "#da9283" , "#474B24")

theme_custom <- theme_minimal(base_size = 14) +
  
  theme(
    plot.title = element_text(face = "bold", 
                              size = 13, 
                              margin = margin(b = 7)
                              ),
    
    plot.subtitle = element_text(size = 10, 
                                 margin = margin(b = 12)
                                 ),
    
    plot.caption = element_text(size = 8, 
                                margin = margin(t = 20)
                                ),
      
    axis.title.x = element_text(size = 10, 
                                margin = margin(t = 10)
                                ),
    
    axis.title.y = element_text(size = 10, 
                                margin = margin(r = 10)
                                ),

    axis.text.x = element_text(size = 10, 
                               margin = margin(t = 5)
                               ),
    
    axis.text.y = element_text(size = 10, 
                               margin = margin(r = 5)
                               ),
    
    legend.position = "none",
    )

Goal of this analysis

The goal of this analysis is to evaluate the consistency of the two assays, including whether instrument differences contribute to variability.

Comparison of Overall Fmax Levels

p_fmax_mean_dist <- aug_data |> 
  ggplot(
    aes(
      x = fmax_mean,
      y = project,
      color = project,
      fill = project
      )
    ) + 
  
  geom_violin(alpha = 0.5, 
              width = 0.8, 
              trim = FALSE) + 
  
  geom_boxplot(width = 0.12, 
               color = "black", 
               outlier.alpha = 0.3) +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  theme_minimal(base_size = 14) +
  
  labs(
    title = "Distribution of Mean Fmax Across SAA Projects",
    subtitle = "Comparison between 24-hour (Project 237) and 
    150-hour (Project 155) α-synuclein SAA assays",
    x = "Mean Fmax [RFU]",
    y = "Project",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) + 
  
  theme_custom +
  scale_x_continuous(labels = scales::label_comma())
ggsave(filename = "../results/07_fmax_mean_dist.png",
       width = 10,  
       height = 5)  

p_fmax_mean_dist

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

Although absolute values differ, the focus of this analysis is not the magnitude of Fmax but the reproducibility of replicate measurements within each project.

<<<<<<< HEAD

Consistency of Technical Replicates

Distribution of Replicate Fmax Values

Means alone can’t be used to compare the consistency of the two assays. To do this, we will compare Fmax across replicates 1–3 for each project.

p_fmax_by_rep <- ggplot(aug_data, aes(
    x = fmax, 
    y = rep, 
    color = project, 
    fill = project
  )) +
  
  geom_boxplot(alpha = 0.7) +
  
  facet_wrap(~ project, scales = "free_x") +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Replicate Variation in Fmax",
    subtitle = "Three technical replicates per sample in Projects 155 and 237",
    x = "Fmax [RFU]",
    y = "Replicate Number",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
  theme(legend.position = "none") + 
  scale_x_continuous(labels = scales::label_comma())
ggsave(filename = "../results/06_fmax_by_rep.png",
       width = 10,  
       height = 5)   

p_fmax_by_rep

=======

Consistency of Technical Replicates

Distribution of Replicate Fmax Values

Means alone can’t be used to compare the consistency of the two assays. To do this, we will compare Fmax across replicates 1–3 for each project.

p_fmax_by_rep <- ggplot(aug_data, aes(
    x = fmax, 
    y = rep, 
    color = project, 
    fill = project
  )) +
  
  geom_boxplot(alpha = 0.7) +
  
  facet_wrap(~ project, scales = "free_x") +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Replicate Variation in Fmax",
    subtitle = "Three technical replicates per sample in Projects 155 and 237",
    x = "Fmax [RFU]",
    y = "Replicate Number",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
  theme(legend.position = "none") + 
  scale_x_continuous(labels = scales::label_comma())
ggsave(filename = "../results/07_fmax_by_rep.png",
       width = 10,  
       height = 5)   

p_fmax_by_rep

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
<<<<<<< HEAD
repeats_fmax_stats <- aug_data |> 
  ungroup() |> 
  summarize(mean=mean(fmax), 
            median=median(fmax),
            IQR = quantile(fmax,.75)- quantile(fmax,.25),
            .by = c(project, rep))
=======
repeats_fmax_stats <- aug_data |> 
  ungroup() |> 
  summarize(mean=mean(fmax), 
            median=median(fmax),
            IQR = quantile(fmax,.75)- quantile(fmax,.25),
            .by = c(project, rep))
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

Summary statistics:

Project Rep 1 Median Rep 2 Median Rep 3 Median Variation (Range)
155 72,926 70,262 69,721 3,205
237 142,138 135,570 137,200 6,568

Observations

  • Project 155: Shows a small, gradual decrease from replicate 1 to 3.

  • Project 237: Shows a dip in replicate 2, with higher values in replicates 1 and 3.

These patterns are consistent and most likely reflect behavior that is specific for protocol.

<<<<<<< HEAD

Replicate Variability (IQR)

=======

Replicate Variability (IQR)

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
Project Rep 1 IQR Rep 2 IQR Rep 3 IQR Variation
155 ~69,736 ~69,404 ~82,973 13,569
237 ~59,530 ~63,910 ~63,179 4,380

Interpretation

  • Project 237: shows a more “tight” replicate clustering, indicating greater precision.

  • Project 155: shows larger and less stable variability, especially in replicate 3.

<<<<<<< HEAD

Replicate Standard Deviation (SD)

Standard deviation (SD) of Fmax across replicates measures stability of the assays.

sd_saa_data <- aug_data |> 
  group_by(patient_number, rundate) |> 
  mutate(fmax_sd = sd(fmax))
p_sd_by_project <- sd_saa_data |> 
  filter(instrument %in% c(2, 4, 6)) |> 
  ggplot(aes(x = fmax_sd, 
             y = project, 
             fill = project)) +
  
  geom_boxplot(alpha = 0.8) +
  
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Replicate Standard Deviation of Fmax",
    subtitle = "Higher SD indicates lower assay precision",
    x = "Standard Deviation of Fmax [RFU]",
    y = "Project",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
  theme(legend.position = "none") +
  scale_x_continuous(labels = scales::label_comma())
ggsave(filename = "../results/06_sd_by_project.png",
       width = 12,  
       height = 4.7)

p_sd_by_project

=======

Replicate Standard Deviation (SD)

Standard deviation (SD) of Fmax across replicates measures stability of the assays.

sd_saa_data <- aug_data |> 
  group_by(patient_number, rundate) |> 
  mutate(fmax_sd = sd(fmax))
p_sd_by_project <- sd_saa_data |> 
  filter(instrument %in% c(2, 4, 6)) |> 
  ggplot(aes(x = fmax_sd, 
             y = project, 
             fill = project)) +
  
  geom_boxplot(alpha = 0.8) +
  
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Replicate Standard Deviation of Fmax",
    subtitle = "Higher SD indicates lower assay precision",
    x = "Standard Deviation of Fmax [RFU]",
    y = "Project",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
  theme(legend.position = "none") +
  scale_x_continuous(labels = scales::label_comma())
ggsave(filename = "../results/07_sd_by_project.png",
       width = 12,  
       height = 4.7)

p_sd_by_project

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
<<<<<<< HEAD
sd_saa_stats <- sd_saa_data |>
  ungroup() |> 
  summarize(min = min(fmax_sd), 
            mean=mean(fmax_sd), 
            median=median(fmax_sd), 
            max=max(fmax_sd),
            IQR = quantile(fmax_sd,.75)- quantile(fmax_sd,.25),
             .by = project)
=======
sd_saa_stats <- sd_saa_data |>
  ungroup() |> 
  summarize(min = min(fmax_sd), 
            mean=mean(fmax_sd), 
            median=median(fmax_sd), 
            max=max(fmax_sd),
            IQR = quantile(fmax_sd,.75)- quantile(fmax_sd,.25),
             .by = project)
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

Summary statistics:

Project Mean SD Median SD IQR
155 24,066 22,198 26,131
237 10,386 5,662 6,470
<<<<<<< HEAD

The 150-hour assay (155) has 2–4 times higher replicate variability than the 24-hour assay (237). This is one of the most important quality results in the entire analysis.

Instrument-Specific Variability

instruments_used <- aug_data |>
  filter(instrument %in% c(2, 4, 6)) |> 
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = instrument),
           position = "fill") + 
  scale_fill_manual(breaks = c("2","4","5","6","9","10"),
                    values = c("#da9283","#3C3C68","#C03221","#627634","#7A1F15","#94BFBE"),
                    name = "Instrument",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Ratio used instruments 2 4 and 6 ",
       subtitle = "The instrument used to analyze each observation",
       x = "Project",
       y = "Ratio")

instruments_used

=======

The 150-hour assay (155) has 2–4 times higher replicate variability than the 24-hour assay (237). This is one of the most important quality results in the entire analysis. Though, it still overlaps, and therefore its hard to make a strong conclusion.

Instrument-Specific Variability

instruments_used <- aug_data |>
  
  filter(instrument %in% c(2, 4, 6)) |> 
  ggplot(mapping = aes(x = project)) + 
  
  geom_bar(mapping = aes(fill = instrument),
           position = "fill") + 
  
  scale_fill_manual(breaks = c("2","4","5","6","9","10"),
                    values = c("#da9283","#3C3C68","#C03221","#627634","#7A1F15","#94BFBE"),
                    name = "Instrument",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Ratio used instruments 2, 4 and 6 ",
       subtitle = "The instrument used to analyze each observation",
       x = "Project",
       y = "Ratio")

instruments_used

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

We next examined whether any fluorescence reader (instruments 2, 4, 6) contributed disproportionately to the SD.

<<<<<<< HEAD
p_sd_by_instrument <- sd_saa_data |> 
  filter(instrument %in% c(2,4,6)) |> 
  
  ggplot(aes(
      x = fmax_sd, 
      y = instrument, 
      color = factor(instrument), 
      fill = factor(instrument)
  )) +
  
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ project, scales = "free_x") +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Instrument-Specific Variability in Fmax",
    subtitle = "Comparison across three fluorescence readers",
    x = "SD of Fmax [RFU]",
    y = "Instrument",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
  scale_x_continuous(labels = scales::label_comma())
ggsave(filename = "../results/06_sd_by_instrument.png",
       width = 11,  
       height = 7)

p_sd_by_instrument

=======
p_sd_by_instrument <- sd_saa_data |> 
  filter(instrument %in% c(2,4,6)) |> 
  
  ggplot(aes(
      x = fmax_sd, 
      y = instrument, 
      color = factor(instrument), 
      fill = factor(instrument)
  )) +
  
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ project, scales = "free_x") +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Instrument-Specific Variability in Fmax",
    subtitle = "Comparison across three fluorescence readers",
    x = "SD of Fmax [RFU]",
    y = "Instrument",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
  scale_x_continuous(labels = scales::label_comma())
ggsave(filename = "../results/07_sd_by_instrument.png",
       width = 11,  
       height = 7)

p_sd_by_instrument

>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
<<<<<<< HEAD
sd_instruments_stats <- sd_saa_data |> 
  ungroup() |> 
  summarize(mean=mean(fmax_sd), 
            median=median(fmax_sd), 
            IQR = quantile(fmax_sd,.75)- quantile(fmax_sd,.25),
             .by = c(project, instrument))
=======
sd_instruments_stats <- sd_saa_data |> 
  ungroup() |> 
  summarize(mean=mean(fmax_sd), 
            median=median(fmax_sd), 
            IQR = quantile(fmax_sd,.75)- quantile(fmax_sd,.25),
             .by = c(project, instrument))
>>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

Instrument-level statistics:

Project Instr. Mean SD Median SD IQR
155 2 30,015 28,999 30,174
237 2 8,384 4,482 4,953
155 4 21,469 20,743 21,746
237 4 8,788 4,594 5,483
155 6 20,190 17,562 24,950
237 6 13,107 8,357 6,809

Interpretation

  • No single instrument is consistently “worst” or “best.”

  • Instrument effects depend on the project, indicating interaction between protocol and hardware.

  • <<<<<<< HEAD
  • Critically, even the highest SD in Project 237 is lower than the lowest SD in Project 155.
    This confirms that protocol (not instrument) is the dominant source of variability.

Raw Fmax by Instruments and Replicates

To visualize replicate patterns across instruments:

p_fmax_raw_rep_inst <- aug_data |> 
  filter(instrument %in% c(2,4,6)) |> 
  
  ggplot(aes(
      x = fmax, 
      y = factor(rep), 
      color = factor(instrument),
      fill = factor(instrument)
  )) +
  
  geom_boxplot(alpha = 0.7) +
  facet_grid(instrument ~ project) +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Raw Fmax Across Instruments and Replicates",
    x = "Fmax [RFU]",
    y = "Replicate Number",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
    scale_x_continuous(labels = scales::label_comma())
ggsave(filename = "../results/06_fmax_raw_rep_inst.png",
       width = 10,  
       height = 5)   


p_fmax_raw_rep_inst

=======
  • Even the highest SD in Project 237 is lower than the lowest SD in Project 155.
    This confirms that protocol (not instrument) is the dominant source of variability.

  • Raw Fmax by Instruments and Replicates

    To visualize replicate patterns across instruments:

    p_fmax_raw_rep_inst <- aug_data |> 
      filter(instrument %in% c(2,4,6)) |> 
      
      ggplot(aes(
          x = fmax, 
          y = factor(rep), 
          color = factor(instrument),
          fill = factor(instrument)
      )) +
      
      geom_boxplot(alpha = 0.7) +
      facet_grid(instrument ~ project) +
      
      scale_color_manual(values = my_cols) +
      scale_fill_manual(values = my_cols) +
      
      labs(
        title = "Raw Fmax Across Instruments and Replicates",
        x = "Fmax [RFU]",
        y = "Replicate Number",
        caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
      ) +
      
      theme_custom +
        scale_x_continuous(labels = scales::label_comma())
    ggsave(filename = "../results/07_fmax_raw_rep_inst.png",
           width = 10,  
           height = 5)   
    
    
    p_fmax_raw_rep_inst

    >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
    <<<<<<< HEAD
    stats_instruments_project <- aug_data |>
      ungroup() |> 
      filter(instrument == 2 | instrument == 4 | instrument == 6) |> 
      summarize(min = min(fmax), 
                mean=mean(fmax), 
                median=median(fmax), 
                max=max(fmax),
                 .by = c(project, rep, instrument))
    =======
    stats_instruments_project <- aug_data |>
      ungroup() |> 
      filter(instrument == 2 | instrument == 4 | instrument == 6) |> 
      summarize(min = min(fmax), 
                mean=mean(fmax), 
                median=median(fmax), 
                max=max(fmax),
                 .by = c(project, rep, instrument))
    
    stats_instruments_project
    # A tibble: 18 × 7
       project rep   instrument   min    mean  median    max
       <chr>   <chr> <chr>      <dbl>   <dbl>   <dbl>  <dbl>
     1 155     1     2            209  93224.  99592. 184313
     2 155     2     2            214  90078   98395  171306
     3 155     3     2            216  88698.  94544. 171318
     4 237     1     6            493 142075. 158205  247730
     5 237     2     6            489 128143. 143678  235264
     6 237     3     6            497 125836. 141988  238974
     7 237     1     2            518 116011. 137888. 208848
     8 237     2     2            508 112391. 133614  202797
     9 237     3     2            521 114789. 134412  211862
    10 237     1     4            445 113159. 132906  250877
    11 237     2     4            422 109894. 130298  244977
    12 237     3     4            426 113289. 133050  257860
    13 155     1     4            290  66303.  69509  136842
    14 155     2     4            301  66400.  69292. 136781
    15 155     3     4            300  64469.  71845  136621
    16 155     1     6            265  63267.  68514  126190
    17 155     2     6            264  62577.  64985  127412
    18 155     3     6            246  62107.  68970  131732
    >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

    Across both projects, either replicate 1 or 3 typically gives the highest Fmax.
    This aligns with known SAA behavior:

    • Rep 1 often initiates early aggregation (“priming”).

    • Rep 3 often captures stabilized aggregation at late cycles.

    Instrument-specific medians confirm these patterns and align with earlier SD/IQR findings.

    <<<<<<< HEAD

    Instrument 2

    155 237
    Median (rep 1) 93781.0 137402.5
    Median (rep 2) 86402.0 133614.0
    Median (rep 3) 81850.0 134146.0

    Instrument 4

    155 237
    Median (rep 1) 66598.0 132876.0
    Median (rep 2) 65894.5 130309.5
    Median (rep 3) 66471.0 132749.0

    instrument 6

    155 237
    Median (rep 1) 63126.0 158205.0
    Median (rep 2) 56437.0 143678.0
    Median (rep 3) 63862.0 141988.0
    ======= >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337

    Overall Interpretation

    <<<<<<< HEAD

    Across all analyses, mean Fmax, replicate medians, IQR, SD, and instrument comparisons, Project 237 shows more precision and replicate reproducibility.

    Key conclusions:

    • Fmax magnitude: Differs between projects due to protocol differences; not directly comparable without normalization.

    • Replicate variability: Is substantially lower in the 24-hour assay (Project 237).

    • Standard deviation: Is 2–4 times lower in Project 237, making it markedly more stable.

    • Instrument effects: Are present but small compared with protocol effects.

    • Project 155’s 150-hour assay: Shows broad variability and inconsistent replicate behavior, reducing precision.

    Summary

    Project 237 (24-hour SAA) demonstrates:

    • Higher Fmax values

    • Much lower replicate variability

    • Stronger consistency across instruments

    • More coherent replicate patterns

    Project 155 (150-hour SAA) shows:

    • Lower Fmax (expected from protocol)

    • Substantially higher replicate noise

    • Greater instrument-dependent variation

    • Overall poorer measurement precision

    =======

    Across all analyses, mean Fmax, replicate medians, IQR, SD, and instrument comparisons, Project 237 shows less variability and replicate reproducibility.

    Key conclusions:

    • Fmax magnitude: Differs between projects due to protocol differences; not directly comparable without looking at protocol specific factors.

    • Replicate variability: Is substantially lower in the 24-hour assay (Project 237).

    • Standard deviation: Is 2–4 times lower in Project 237, making it markedly more stable.

    • Instrument effects: Are present but small compared with protocol effects. The differences is project based, not instrument based.

    • Project 237 seems to be more consistent, but because of their overlaps in standard deviation, it is hard to make a strong conclusion.

    08_analysis_4

    Load libraries and data

    library(tidyverse)
    library(readr)
    library(modelr)
    aug_data <- read_tsv("../data/03_dat_aug.tsv")

    Defining colors and theme

    my_cols <- c("#3C3C68","#C03221","#6D8E64","#94BFBE","#545E75",  "#7A1F15",
                 "#da9283" , "#474B24")
    
    theme_custom <- theme_minimal(base_size = 14) +
      
      theme(
        plot.title = element_text(face = "bold", 
                                  size = 13, 
                                  margin = margin(b = 7)
                                  ),
        
        plot.subtitle = element_text(size = 10, 
                                     margin = margin(b = 12)
                                     ),
        
        plot.caption = element_text(size = 8, 
                                    margin = margin(t = 20)
                                    ),
          
        axis.title.x = element_text(size = 10, 
                                    margin = margin(t = 10)
                                    ),
        
        axis.title.y = element_text(size = 10, 
                                    margin = margin(r = 10)
                                    ),
    
        axis.text.x = element_text(size = 10, 
                                   margin = margin(t = 5)
                                   ),
        
        axis.text.y = element_text(size = 10, 
                                   margin = margin(r = 5)
                                   ),
            )

    Analysis

    The purpose of this analysis is to see whether the test gets better at subsequent visits or rather: does the test become better further into disease progression? We investigate this by looking at mean fmax, which of course is not strictly related to positive tests, over the calculated days from baseline. This days from baseline of course does not correspond to overall disease progression but is a measure of time for each individual patient. However, we will still try to investigate it with modeling. First, an overall scatter plot.

    aug_data |> 
      ggplot(aes(x = days_from_baseline, 
                   y = fmax_mean)) +
      geom_point(aes(color = saa_result,
                     shape = as.factor(project))) +
      labs(title = "Mean fmax over time",
           x = "Days from baseline",
           y = "Mean fmax (RFU)",
           shape = "Project",
           color = "SAA status",
           caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
      
      scale_color_manual(values = my_cols) +
      scale_fill_manual(values = my_cols) +
      
      theme_minimal() +
      theme_custom

      ggsave(filename = "../results/08_fmax_over_time.png",
           height = 5,
           width = 10)

    As the fmax from the projects is not comparable, we will only look at one for the modelling. This will be project 155 as project 237 only have observations close to baseline.

    Below a linear model is fitted as well as a residual plot.

    sim1_mod <- lm(fmax_mean ~ days_from_baseline, 
                   data = filter(aug_data, project == 237))
    
    coef(sim1_mod)
           (Intercept) days_from_baseline 
          1.183637e+05       3.316171e+00 
    grid <- aug_data|> 
      filter(project == 237) |> 
      data_grid(days_from_baseline)
    grid <- grid |> 
      add_predictions(sim1_mod) 
    aug_data |> 
      filter(project == 237) |> 
      ggplot(aes(x = days_from_baseline, 
                 y = fmax_mean)) +
      geom_point(aes(color = saa_result)) +
      geom_line(aes(y = pred), 
                data = grid, 
                colour = "red", 
                size = 1) + 
      
      labs(title = "Mean fmax over time for project 237 with linear fit",
           x = "Days from baseline",
           y = "Mean fmax (RFU)",
           shape = "Project",
           color = "SAA status",
           caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
      
      scale_color_manual(values = my_cols) +
      scale_fill_manual(values = my_cols) +
      
      theme_minimal() +
      theme_custom
    Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
    ℹ Please use `linewidth` instead.

    ggsave(filename = "../results/08_fmax_over_time_lm.png",
           height = 5,
           width = 12)
    resid <- filter(aug_data, project == 237) |> 
      add_residuals(sim1_mod)
    
    ggplot(resid, aes(days_from_baseline, 
                      resid)) + 
      geom_ref_line(h = 0) +
      geom_point(color = "#545E75") +
      theme_minimal() +
      
      labs(title = "Residual plot for lm of fmax over time for project 237",
           x = "Days from baseline",
           y = "Resid",
           caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
      
      theme_custom +
      scale_y_continuous(labels = scales::label_comma())

    ggsave(filename = "../results/08_fmax_over_time_resid.png",
           height = 5,
           width = 10)

    This really tells us nothing. Fitting to a linear model does not make sense, as expected from the scatter plot, which showed no clear pattern.

    This may be because that days from baseline is a bad indicator for disease progression, so for this reason we want to look at the development for each patient. Only a few patients had multiple samples taken, so first we look at the 12 patients with the most patients.

    top_12 <- aug_data |> 
      count(patient_number, sort = TRUE) |> 
      head(12)
    
    aug_data |> 
      filter(patient_number %in% top_12$patient_number) |> 
        ggplot(aes(x = days_from_baseline, 
                   y = fmax_mean)) +
      
      geom_line(color = "gray") +
      
      geom_point(aes(color = saa_result,
                     shape = as.factor(project))) +
      
      facet_wrap(~ patient_number) +
      theme_minimal() +
      
      scale_color_manual(values = my_cols) +
      scale_fill_manual(values = my_cols) +
      
      scale_x_continuous(n.breaks = 3) +
      
      labs(title = "Mean fmax over time for the 12 patients with the most visits",
           x = "Days from baseline",
           y = "Mean fmax (RFU)",
           shape = "Project",
           color = "SAA status",
           caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
      
      theme_custom

    ggsave(filename = "../results/08_fmax_over_time_top12.png",
           height = 5,
           width = 12)

    This last bit of code allows looking at a random sample from the top 100 of patients with the most visits.

    top_100 <- aug_data |> 
      count(patient_number, sort = TRUE) |> 
      head(100)
    
    sample <- top_100 |> 
      sample_n(20)
    
    aug_data |> 
      filter(patient_number %in% sample$patient_number) |> 
      ggplot(aes(x = days_from_baseline, 
                   y = fmax_mean)) +
      
      geom_line(color = "gray") +
    
      geom_point(aes(color = saa_result,
                     shape = as.factor(project))) +
      theme_minimal() +
      scale_x_continuous(n.breaks = 3) +
      facet_wrap(~ patient_number) +
      scale_color_manual(values = my_cols) +
      scale_fill_manual(values = my_cols) +
      
      labs(title = "Mean fmax over time for random patients with the most visits",
           x = "Days from baseline",
           y = "Mean fmax (RFU)",
           shape = "Project",
           color = "SAA status",
           caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
      
      theme_custom

    >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337
    ======= } else { // See if we can fetch a full url (with no hash to target) // This is a special case and we should probably do some content thinning / targeting fetch(url) .then(res => res.text()) .then(html => { const parser = new DOMParser(); const htmlDoc = parser.parseFromString(html, "text/html"); const note = htmlDoc.querySelector('main.content'); if (note !== null) { // This should only happen for chapter cross references // (since there is no id in the URL) // remove the first header if (note.children.length > 0 && note.children[0].tagName === "HEADER") { note.children[0].remove(); } const html = processXRef(null, note); instance.setContent(html); } }).finally(() => { instance.enable(); instance.show(); }); } }, function(instance) { }); } let selectedAnnoteEl; const selectorForAnnotation = ( cell, annotation) => { let cellAttr = 'data-code-cell="' + cell + '"'; let lineAttr = 'data-code-annotation="' + annotation + '"'; const selector = 'span[' + cellAttr + '][' + lineAttr + ']'; return selector; } const selectCodeLines = (annoteEl) => { const doc = window.document; const targetCell = annoteEl.getAttribute("data-target-cell"); const targetAnnotation = annoteEl.getAttribute("data-target-annotation"); const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation)); const lines = annoteSpan.getAttribute("data-code-lines").split(","); const lineIds = lines.map((line) => { return targetCell + "-" + line; }) let top = null; let height = null; let parent = null; if (lineIds.length > 0) { //compute the position of the single el (top and bottom and make a div) const el = window.document.getElementById(lineIds[0]); top = el.offsetTop; height = el.offsetHeight; parent = el.parentElement.parentElement; if (lineIds.length > 1) { const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]); const bottom = lastEl.offsetTop + lastEl.offsetHeight; height = bottom - top; } if (top !== null && height !== null && parent !== null) { // cook up a div (if necessary) and position it let div = window.document.getElementById("code-annotation-line-highlight"); if (div === null) { div = window.document.createElement("div"); div.setAttribute("id", "code-annotation-line-highlight"); div.style.position = 'absolute'; parent.appendChild(div); } div.style.top = top - 2 + "px"; div.style.height = height + 4 + "px"; div.style.left = 0; let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter"); if (gutterDiv === null) { gutterDiv = window.document.createElement("div"); gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter"); gutterDiv.style.position = 'absolute'; const codeCell = window.document.getElementById(targetCell); const gutter = codeCell.querySelector('.code-annotation-gutter'); gutter.appendChild(gutterDiv); } gutterDiv.style.top = top - 2 + "px"; gutterDiv.style.height = height + 4 + "px"; } selectedAnnoteEl = annoteEl; } }; const unselectCodeLines = () => { const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"]; elementsIds.forEach((elId) => { const div = window.document.getElementById(elId); if (div) { div.remove(); } }); selectedAnnoteEl = undefined; }; // Handle positioning of the toggle window.addEventListener( "resize", throttle(() => { elRect = undefined; if (selectedAnnoteEl) { selectCodeLines(selectedAnnoteEl); } }, 10) ); function throttle(fn, ms) { let throttle = false; let timer; return (...args) => { if(!throttle) { // first call gets through fn.apply(this, args); throttle = true; } else { // all the others get throttled if(timer) clearTimeout(timer); // cancel #2 timer = setTimeout(() => { fn.apply(this, args); timer = throttle = false; }, ms); } }; } // Attach click handler to the DT const annoteDls = window.document.querySelectorAll('dt[data-target-cell]'); for (const annoteDlNode of annoteDls) { annoteDlNode.addEventListener('click', (event) => { const clickedEl = event.target; if (clickedEl !== selectedAnnoteEl) { unselectCodeLines(); const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active'); if (activeEl) { activeEl.classList.remove('code-annotation-active'); } selectCodeLines(clickedEl); clickedEl.classList.add('code-annotation-active'); } else { // Unselect the line unselectCodeLines(); clickedEl.classList.remove('code-annotation-active'); } }); } const findCites = (el) => { const parentEl = el.parentElement; if (parentEl) { const cites = parentEl.dataset.cites; if (cites) { return { el, cites: cites.split(' ') }; } else { return findCites(el.parentElement) } } else { return undefined; } }; var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]'); for (var i=0; i >>>>>>> 83e0cb80d6084ba6eed8292e6bc88abd6d160337